In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# DIMENSION REDUCTION AND SHRINKAGE
# Import necessary libraries
! pip install numpy;
! pip install pandas;
! pip install scikit-learn;
! pip install statsmodels;
! pip install mlxtend;
! pip install ISLP;
import numpy as np
import pandas as pd
import statsmodels.api as sm
In [12]:
# Variable Selection and Ridge Regression
# Part I: Variable Selection
# 1. Variable Selection Using Exhaustive Search (like regsubsets in R)
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.linear_model import LinearRegression
# Load Boston dataset
from ISLP import load_data;
boston = load_data('boston')
print(boston)
        crim    zn  indus  chas    nox     rm   age     dis  rad  tax  \
0    0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296
1    0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242
2    0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242
3    0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222
4    0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222
..       ...   ...    ...   ...    ...    ...   ...     ...  ...  ...
501  0.06263   0.0  11.93     0  0.573  6.593  69.1  2.4786    1  273
502  0.04527   0.0  11.93     0  0.573  6.120  76.7  2.2875    1  273
503  0.06076   0.0  11.93     0  0.573  6.976  91.0  2.1675    1  273
504  0.10959   0.0  11.93     0  0.573  6.794  89.3  2.3889    1  273
505  0.04741   0.0  11.93     0  0.573  6.030  80.8  2.5050    1  273

     ptratio  lstat  medv
0       15.3   4.98  24.0
1       17.8   9.14  21.6
2       17.8   4.03  34.7
3       18.7   2.94  33.4
4       18.7   5.33  36.2
..       ...    ...   ...
501     21.0   9.67  22.4
502     21.0   9.08  20.6
503     21.0   5.64  23.9
504     21.0   6.48  22.0
505     21.0   7.88  11.9

[506 rows x 13 columns]
In [ ]:
# Define X and y for regression
X_boston = boston[['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']]
y_boston = boston[['medv']]
print(X_boston,y_boston)
In [ ]:
# Initialize the linear regression model
lr = LinearRegression()
# Exhaustive feature selection
efs = EFS(lr,
          min_features=1,
          max_features=12,
          scoring='r2',
          print_progress=True,
          cv=5)
efs.fit(X_boston, y_boston)
In [ ]:
# You can ignore these warnings; they only concern deprecations planned for future versions of scikit-learn's model_selection module.
In [19]:
# Best subset of features
best_features = efs.best_feature_names_
print(f"Best features: {best_features}")
# Summarize results
efs_results = pd.DataFrame(efs.get_metric_dict()).T
print(efs_results[['feature_idx', 'avg_score', 'std_dev']].head())
Best features: ('crim', 'zn', 'chas', 'nox', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat')
  feature_idx  avg_score   std_dev
0        (0,)  -0.319539  0.423008
1        (1,)  -0.438576  0.603464
2        (2,)   -0.21937  0.154856
3        (3,)   -0.64991  0.846949
4        (4,)  -0.307827   0.34222
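In [ ]:
# Optional check (a minimal sketch, not part of the original lab): refit a
# linear model on the subset chosen by the exhaustive search and report its
# 5-fold cross-validated R^2 for comparison with the full model.
from sklearn.model_selection import cross_val_score
X_best = X_boston[list(efs.best_feature_names_)]
cv_r2 = cross_val_score(LinearRegression(), X_best, y_boston.values.ravel(),
                        scoring='r2', cv=5)
print(f"CV R^2 on the selected subset: {cv_r2.mean():.3f} (+/- {cv_r2.std():.3f})")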
In [20]:
# 2. Stepwise Regression
def forward_selection(X, y):
    # Forward stepwise selection: at each step add the candidate predictor
    # that gives the lowest AIC, and stop when no addition improves AIC.
    initial_features = []
    remaining_features = list(X.columns)
    best_features = initial_features[:]
    current_score, best_new_score = float('inf'), float('inf')
    while remaining_features:
        scores_with_candidates = []
        for candidate in remaining_features:
            formula = best_features + [candidate]
            X_new = X[formula]
            X_new = sm.add_constant(X_new)
            model = sm.OLS(y, X_new).fit()
            score = model.aic
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()  # smallest AIC first
        best_new_score, best_candidate = scores_with_candidates[0]
        if current_score > best_new_score:
            remaining_features.remove(best_candidate)
            best_features.append(best_candidate)
            current_score = best_new_score
        else:
            break
    # Refit the final model on the selected features
    formula = best_features
    X_final = X[formula]
    X_final = sm.add_constant(X_final)
    model = sm.OLS(y, X_final).fit()
    return model
model = forward_selection(X_boston, y_boston)
print(model.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                   medv   R-squared:                       0.734
Model:                            OLS   Adj. R-squared:                  0.729
Method:                 Least Squares   F-statistic:                     136.8
Date:                Fri, 27 Sep 2024   Prob (F-statistic):          1.73e-135
Time:                        13:41:06   Log-Likelihood:                -1505.0
No. Observations:                 506   AIC:                             3032.
Df Residuals:                     495   BIC:                             3078.
Df Model:                          10
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         41.4517      4.903      8.454      0.000      31.818      51.086
lstat         -0.5465      0.047    -11.519      0.000      -0.640      -0.453
rm             3.6730      0.409      8.978      0.000       2.869       4.477
ptratio       -0.9310      0.130     -7.138      0.000      -1.187      -0.675
dis           -1.5160      0.188     -8.078      0.000      -1.885      -1.147
nox          -18.2624      3.565     -5.122      0.000     -25.267     -11.258
chas           2.8719      0.863      3.329      0.001       1.177       4.567
zn             0.0462      0.014      3.378      0.001       0.019       0.073
crim          -0.1217      0.033     -3.696      0.000      -0.186      -0.057
rad            0.2839      0.064      4.440      0.000       0.158       0.410
tax           -0.0123      0.003     -3.608      0.000      -0.019      -0.006
==============================================================================
Omnibus:                      172.594   Durbin-Watson:                   1.074
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              725.971
Skew:                           1.486   Prob(JB):                    2.28e-158
Kurtosis:                       8.060   Cond. No.                     1.13e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.13e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
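In [ ]:
# Backward elimination (a minimal sketch, not part of the original lab): the
# mirror image of forward_selection above. Start from the full model and
# repeatedly drop the predictor whose removal lowers AIC the most, stopping
# when no removal improves AIC.
def backward_elimination(X, y):
    features = list(X.columns)
    current_score = sm.OLS(y, sm.add_constant(X[features])).fit().aic
    while len(features) > 1:
        scores_with_candidates = []
        for candidate in features:
            reduced = [f for f in features if f != candidate]
            model = sm.OLS(y, sm.add_constant(X[reduced])).fit()
            scores_with_candidates.append((model.aic, candidate))
        scores_with_candidates.sort()  # smallest AIC first
        best_new_score, dropped_feature = scores_with_candidates[0]
        if best_new_score < current_score:
            features.remove(dropped_feature)
            current_score = best_new_score
        else:
            break
    return sm.OLS(y, sm.add_constant(X[features])).fit()

backward_model = backward_elimination(X_boston, y_boston)
print(backward_model.summary())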
In [22]:
# Part II: Ridge Regression
# To perform Ridge Regression, you can use the Ridge class from sklearn.linear_model.
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_boston, y_boston, test_size=0.2, random_state=42)
# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
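In [ ]:
# Why standardize? The ridge penalty shrinks every coefficient by the same rule,
# so predictors should be on a common scale. A quick sanity check (not part of
# the original lab): each scaled training column has mean ~0 and std ~1.
print(X_train_scaled.mean(axis=0).round(2))
print(X_train_scaled.std(axis=0).round(2))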
In [23]:
# Ridge regression over a grid of lambdas (alpha = 0 corresponds to ordinary least squares)
alphas = np.linspace(0, 10, 100)
ridge_models = [Ridge(alpha=alpha).fit(X_train_scaled, y_train) for alpha in alphas]
In [28]:
# Plot the coefficients against different alphas
import matplotlib.pyplot as plt
plt.figure(figsize=(10,6))
for i in range(X_boston.shape[1]):
    plt.plot(alphas, [ridge.coef_[0][i] for ridge in ridge_models],
             label=X_boston.columns[i])  # coef_ is 2-D because y_boston is a one-column DataFrame
plt.xlabel('Lambda')
plt.ylabel('Coefficient values')
plt.title('Ridge Regression Coefficients vs Lambda')
plt.legend()
plt.show()
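In [ ]:
# Optional sketch (not part of the original lab): mean_squared_error was imported
# above but not yet used; here it tracks test-set error over the same grid of
# lambdas, showing how the penalty trades a little bias for lower variance.
test_mse = [mean_squared_error(y_test, m.predict(X_test_scaled)) for m in ridge_models]
plt.figure(figsize=(8, 5))
plt.plot(alphas, test_mse)
plt.xlabel('Lambda')
plt.ylabel('Test MSE')
plt.title('Ridge Regression Test MSE vs Lambda')
plt.show()
print(f"Smallest test MSE: {min(test_mse):.3f} at lambda = {alphas[int(np.argmin(test_mse))]:.2f}")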
In [30]:
# Choose the best lambda using cross-validation
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(alphas=np.linspace(0.01, 10, 100), store_cv_values=True)
ridge_cv.fit(X_train_scaled, y_train)
print(f"Best lambda: {ridge_cv.alpha_}")
Best lambda: 7.073636363636363
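In [ ]:
# Final check (a minimal sketch, not part of the original lab): refit Ridge at the
# cross-validated lambda and compare its test MSE with ordinary least squares on
# the same train/test split.
ridge_best = Ridge(alpha=ridge_cv.alpha_).fit(X_train_scaled, y_train)
ols = LinearRegression().fit(X_train_scaled, y_train)
print(f"Ridge test MSE: {mean_squared_error(y_test, ridge_best.predict(X_test_scaled)):.3f}")
print(f"OLS test MSE:   {mean_squared_error(y_test, ols.predict(X_test_scaled)):.3f}")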